{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from math import expm1, log1p, sqrt\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score\n",
    "from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn import model_selection\n",
    "from sklearn import ensemble\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import matplotlib.pylab as plt\n",
    "%matplotlib inline\n",
    "from matplotlib.pylab import rcParams\n",
    "rcParams['figure.figsize'] = 12, 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_folds = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = pd.read_csv(\"train.csv\")\n",
    "test = pd.read_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X.drop('ID', axis=1, inplace=True)\n",
    "test_id = test.pop('ID')\n",
    "X['target'] = X['target'].apply(lambda x: log1p(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y = X.pop('target')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removed `256` Constant Columns\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cols_to_remove = []\n",
    "for col in X.columns:\n",
    "    if X[col].std() == 0: \n",
    "        cols_to_remove.append(col)\n",
    "        \n",
    "# remove constant columns in the training set\n",
    "X.drop(cols_to_remove, axis=1, inplace=True)\n",
    "# remove constant columns in the test set\n",
    "test.drop(cols_to_remove, axis=1, inplace=True)\n",
    "\n",
    "print(\"Removed `{}` Constant Columns\\n\".format(len(cols_to_remove)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removed `4` Duplicate Columns\n",
      "\n"
     ]
    }
   ],
   "source": [
    "cols_to_remove = []\n",
    "cols_scaned = []\n",
    "dups = {}\n",
    "\n",
    "columns = X.columns\n",
    "for i in range(len(columns) - 1):\n",
    "    v = X[columns[i]].values\n",
    "    dup_cols = []\n",
    "    for j in range(i + 1, len(columns)):\n",
    "        if np.array_equal(v, X[columns[j]].values):\n",
    "            cols_to_remove.append(columns[j])\n",
    "            if columns[j] not in cols_scaned:\n",
    "                dup_cols.append(columns[j]) \n",
    "                cols_scaned.append(columns[j])\n",
    "                dups[columns[i]] = dup_cols\n",
    "                \n",
    "# remove duplicate columns in the training set\n",
    "X.drop(cols_to_remove, axis=1, inplace=True) \n",
    "# remove duplicate columns in the testing set\n",
    "test.drop(cols_to_remove, axis=1, inplace=True)\n",
    "\n",
    "print(\"Removed `{}` Duplicate Columns\\n\".format(len(dups)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.5367277914935242\n"
     ]
    }
   ],
   "source": [
    "NUM_OF_FEATURES = 1000\n",
    "def rmsle(y, pred):\n",
    "    return np.sqrt(np.mean(np.power(y - pred, 2)))\n",
    "\n",
    "x1, x2, y1, y2 = model_selection.train_test_split(X, y, test_size=0.20, random_state=5)\n",
    "model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)\n",
    "model.fit(x1, y1)\n",
    "print(rmsle(y2, model.predict(x2)))\n",
    "\n",
    "col = pd.DataFrame({'importance': model.feature_importances_, 'feature': X.columns}).sort_values(\n",
    "    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values\n",
    "X = X[col]\n",
    "test = test[col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rmsle_cv(model):\n",
    "    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)\n",
    "    rmse= np.sqrt(-cross_val_score(model, X, y, scoring=\"neg_mean_squared_error\", cv = kf))\n",
    "    return (rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_xgb = xgb.XGBRegressor(colsample_bytree=0.054, gamma=0.4, \n",
    "                             learning_rate=0.01, max_depth=8, \n",
    "                             min_child_weight=5, n_estimators=1000,\n",
    "                             reg_alpha=1e-05, reg_lambda=0.8571,\n",
    "                             subsample=0.6, random_state =7,\n",
    "                             nthread=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=144,\n",
    "                              learning_rate=0.005, n_estimators=720, max_depth=13,\n",
    "                              metric='rmse', is_training_metric=True,\n",
    "                              max_bin = 55, bagging_fraction = 0.8,\n",
    "                              bagging_freq = 5, feature_fraction = 0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):\n",
    "    def __init__(self, models):\n",
    "        self.models = models\n",
    "        \n",
    "    # we define clones of the original models to fit the data in\n",
    "    def fit(self, X, y, eval_metric='rmse'):\n",
    "        self.models_ = [clone(x) for x in self.models]\n",
    "        \n",
    "        # Train cloned base models\n",
    "        for model in self.models_:\n",
    "            model.fit(X, y, eval_metric=eval_metric)\n",
    "\n",
    "        return self\n",
    "    \n",
    "    #Now we do the predictions for cloned models and average them\n",
    "    def predict(self, X):\n",
    "        predictions = np.column_stack([\n",
    "            model.predict(X) for model in self.models_\n",
    "        ])\n",
    "        return np.mean(predictions, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = Pipeline([\n",
    "    ('vt', VarianceThreshold(threshold=0.0)),\n",
    "    ('avg-cv', AveragingModels(models = (model_xgb, model_lgb)))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Averaged base models score: 1.4234 (0.0536)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "score = rmsle_cv(pipe)\n",
    "print(\"Averaged base models score: {:.4f} ({:.4f})\\n\".format(score.mean(), score.std()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "submit = pd.DataFrame()\n",
    "submit['ID'] = test_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pipe.fit(X, y, eval_metric='rmse')\n",
    "predictions = pipe.predict(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "submit['target'] = [expm1(x) for x in predictions]\n",
    "submit.to_csv('my_XGB_prediction.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
